{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8f66d95-a9c4-40f1-8cf8-19795653c3f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the Sycamore document ETL library: https://github.com/aryn-ai/sycamore\n",
    "# %pip (rather than !pip) installs into the environment the running kernel uses;\n",
    "# the quotes stop shells such as zsh from globbing the [elasticsearch] extra.\n",
    "%pip install \"sycamore-ai[elasticsearch]\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60b49e1c-7055-4534-ac09-8b7ab45086d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sycamore\n",
    "from sycamore.context import ExecMode\n",
    "from sycamore.transforms.partition import ArynPartitioner\n",
    "from sycamore.transforms.extract_schema import LLMPropertyExtractor\n",
    "from sycamore.transforms.summarize_images import SummarizeImages, LLMImageSummarizer\n",
    "from sycamore.transforms.standardizer import (\n",
    "    USStateStandardizer,\n",
    "    DateTimeStandardizer,\n",
    "    ignore_errors,\n",
    ")\n",
    "from sycamore.transforms.merge_elements import GreedySectionMerger\n",
    "from sycamore.functions.tokenizer import HuggingFaceTokenizer\n",
    "from sycamore.transforms.embed import SentenceTransformerEmbedder\n",
    "from sycamore.llms import OpenAI, OpenAIModels\n",
    "\n",
    "import pyarrow.fs\n",
    "\n",
    "# LLM shared by the property-extraction and image-summarization steps further down.\n",
    "# NOTE(review): the OpenAI client presumably needs OPENAI_API_KEY in the environment - confirm.\n",
    "llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n",
    "\n",
    "# The Aryn token must be available in ARYN_API_KEY. setdefault keeps a key that is\n",
    "# already exported instead of overwriting it with the placeholder; edit the\n",
    "# placeholder only if the variable is not set outside the notebook.\n",
    "os.environ.setdefault(\"ARYN_API_KEY\", \"<MY-ARYN-API-KEY>\")\n",
    "\n",
    "# Public bucket holding the NTSB accident-report PDFs used in this tutorial.\n",
    "paths = [\"s3://aryn-public/ntsb/\"]\n",
    "\n",
    "context = sycamore.init()\n",
    "# Add exec_mode=ExecMode.LOCAL to .init to run without Ray\n",
    "docset = context.read.binary(paths=paths, binary_format=\"pdf\")\n",
    "# Cache the downloaded PDFs locally; MATERIALIZE_USE_STORED reuses the stored copy on re-runs.\n",
    "docset = docset.materialize(\n",
    "    path=\"./elasticsearch-tutorial/downloaded-docset\",\n",
    "    source_mode=sycamore.MATERIALIZE_USE_STORED,\n",
    ")\n",
    "# Partition each PDF into elements (including table structure and images) with the\n",
    "# Aryn partitioner, and cache the result the same way.\n",
    "partitioned_docset = docset.partition(\n",
    "    partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True)\n",
    ").materialize(\n",
    "    path=\"./elasticsearch-tutorial/partitioned-docset\",\n",
    "    source_mode=sycamore.MATERIALIZE_USE_STORED,\n",
    ")\n",
    "# DocSets are lazy: execute() runs the pipeline defined above.\n",
    "partitioned_docset.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a755a09e-1622-400b-8b75-b3bad2981b5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Schema describing the fields the LLM should extract from each accident report.\n",
    "schema = {\n",
    "    \"type\": \"object\",\n",
    "    \"properties\": {\n",
    "        \"accidentNumber\": {\"type\": \"string\"},\n",
    "        \"dateAndTime\": {\"type\": \"date\"},\n",
    "        \"location\": {\n",
    "            \"type\": \"string\",\n",
    "            \"description\": \"US State where the incident occurred\",\n",
    "        },\n",
    "        \"aircraft\": {\"type\": \"string\"},\n",
    "        \"aircraftDamage\": {\"type\": \"string\"},\n",
    "        \"injuries\": {\"type\": \"string\"},\n",
    "        \"definingEvent\": {\"type\": \"string\"},\n",
    "    },\n",
    "    \"required\": [\"accidentNumber\", \"dateAndTime\", \"location\", \"aircraft\"],\n",
    "}\n",
    "\n",
    "schema_name = \"FlightAccidentReport\"\n",
    "# num_of_elements=20 - presumably only the first 20 elements of a document are\n",
    "# shown to the LLM when extracting properties; confirm against the Sycamore docs.\n",
    "property_extractor = LLMPropertyExtractor(\n",
    "    llm=llm, num_of_elements=20, schema_name=schema_name, schema=schema\n",
    ")\n",
    "\n",
    "enriched_docset = (\n",
    "    partitioned_docset\n",
    "    # Extracts the properties based on the schema defined\n",
    "    .extract_properties(property_extractor=property_extractor)\n",
    "    # Summarizes images that were extracted using an LLM\n",
    "    .transform(SummarizeImages, summarizer=LLMImageSummarizer(llm=llm))\n",
    ")\n",
    "\n",
    "# Normalize the extracted values. ignore_errors wraps each standardizer, so a document\n",
    "# whose field is missing or unparseable presumably passes through unchanged.\n",
    "formatted_docset = (\n",
    "    enriched_docset\n",
    "    # Converts state abbreviations to their full names.\n",
    "    .map(\n",
    "        lambda doc: ignore_errors(\n",
    "            doc, USStateStandardizer, [\"properties\", \"entity\", \"location\"]\n",
    "        )\n",
    "    )\n",
    "    # Converts datetime into a common format\n",
    "    .map(\n",
    "        lambda doc: ignore_errors(\n",
    "            doc, DateTimeStandardizer, [\"properties\", \"entity\", \"dateAndTime\"]\n",
    "        )\n",
    "    )\n",
    ")\n",
    "\n",
    "\n",
    "# Merge neighbouring elements into chunks of at most 512 tokens.\n",
    "merger = GreedySectionMerger(\n",
    "    tokenizer=HuggingFaceTokenizer(\"sentence-transformers/all-MiniLM-L6-v2\"),\n",
    "    max_tokens=512,\n",
    ")\n",
    "chunked_docset = formatted_docset.merge(merger=merger)\n",
    "\n",
    "# Sentence-transformers model used to embed every chunk.\n",
    "model_name = \"thenlper/gte-small\"\n",
    "\n",
    "embedded_docset = (\n",
    "    # Copy the document-level \"entity\" and \"path\" properties onto each chunk ...\n",
    "    chunked_docset.spread_properties([\"entity\", \"path\"])\n",
    "    # ... so they are kept when every chunk becomes its own document.\n",
    "    .explode()\n",
    "    .embed(\n",
    "        embedder=SentenceTransformerEmbedder(batch_size=10_000, model_name=model_name)\n",
    "    )\n",
    ")\n",
    "\n",
    "# Cache the embedded chunks; MATERIALIZE_USE_STORED reuses the stored copy on re-runs.\n",
    "embedded_docset = embedded_docset.materialize(\n",
    "    path=\"./elasticsearch-tutorial/embedded-docset\",\n",
    "    source_mode=sycamore.MATERIALIZE_USE_STORED,\n",
    ")\n",
    "embedded_docset.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9321d7e-e812-41ac-8030-3db80c2147ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write to a persistent Elasticsearch index. Note: an Elasticsearch instance must be running at `url` for this to work.\n",
    "# For more information on how to set one up, refer to https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html\n",
    "\n",
    "url = \"http://localhost:9200\"\n",
    "index_name = \"aryn-demo\"\n",
    "# Size of the vectors produced by the embedding model (thenlper/gte-small -> 384).\n",
    "# Update this value if a different embedding model is used.\n",
    "dimensions = 384\n",
    "\n",
    "# The password is read from the ELASTIC_PASSWORD environment variable.\n",
    "embedded_docset.write.elasticsearch(\n",
    "    url=url,\n",
    "    index_name=index_name,\n",
    "    es_client_args={\"basic_auth\": (\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\"))},\n",
    "    mappings={\n",
    "        \"properties\": {\n",
    "            # Indexed dense vector so the chunks can be searched by cosine similarity.\n",
    "            \"embeddings\": {\n",
    "                \"type\": \"dense_vector\",\n",
    "                \"dims\": dimensions,\n",
    "                \"index\": True,\n",
    "                \"similarity\": \"cosine\",\n",
    "            },\n",
    "            \"properties\": {\"type\": \"object\"},\n",
    "        }\n",
    "    },\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52970be4-7bac-455b-bcd0-868130ac61fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify data has been loaded using DocSet Query to retrieve chunks\n",
    "# match_all returns every document in the index.\n",
    "query_params = {\"match_all\": {}}\n",
    "# Read back through the Sycamore context created by sycamore.init() above.\n",
    "query_docs = context.read.elasticsearch(\n",
    "    url=url,\n",
    "    index_name=index_name,\n",
    "    query=query_params,\n",
    "    es_client_args={\"basic_auth\": (\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\"))},\n",
    ")\n",
    "# Hide the embedding vectors to keep the preview readable.\n",
    "query_docs.show(show_embedding=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
